perm filename NEWHYF.TMP[TEX,DEK] blob
sn#554098 filedate 1981-01-05 generic text, type T, neo UTF8
comment Hyphenation (word division) routines.
The following procedure is pretty much independent of the rest of TEX and
can be omitted on first reading;
internaldef excepsize=337,sufsize=116,prefsize=109,btabsize=30
# hyphenation table sizes;
internal saf integer array exceptable[0:excepsize-1]
# ordered hash table for exceptional words;
internal saf integer array excephyph[1:excepsize-1]
# corresponding hyphenation patterns;
internal saf integer array suffix[0:sufsize-1] # interpretive commands for suffixes;
internal saf integer array prefix[0:prefsize-1] # interpretive cmnds for prefixes;
internal saf integer array btable[2:btabsize+1] # consonant-pair exception table;
procedure hyphenate(integer p,n,dhyphen) # insert discretionary hyphens;
begin comment This procedure puts discretionary hyphen node of the specified
kind into the linked list of n ≥ 5 letters starting in mem[p]. (Kern nodes
might be between letters of this list.) The auxiliary tables for hyphenation
are built in TEXPRE.
This routine was developed jointly by D. E. Knuth and F. M. Liang;
integer u,q,r,b,c,h,i,j,t,pc;
integer finale # location of final "e" when the suffix routine starts
(temporarily set to 999999 if the suffix "ed" was just removed);
boolean firsttime;
label hashloop,phase2,sufbegin,interps,falsexx,marksuf,restarts,phase3,checkc,
restartp,interpp,marki,phase4c,vowelscan,phase4v,phase4vc,ertest,phase5,hashsearch;
comment People who don't like go to statements should not read this;
define o(c)=⊂"c" land '37⊃ # five-bit version of ascii character c;
u←getnode(n+2) # Get consecutive locations for convenient working back and forth;
q←p # prepare to store the given letters in the sequential list;
for i←u+1 thru u+n do
begin mem[i]←info(q) land '37 # store five bits of character;
q←link(q); if type(q)=kernnode then q←link(q);
end;
finale←1000000 # infinity;
comment The main part of this procedure (from now up to Phase 5) works entirely
in the sequential list just formed. Assuming that
mem[u]=0, mem[u+i]=a[i] for 1≤i≤n, mem[u+n+1]=0,
this procedure hyphenates the word a[1]...a[n] by setting mem[u+i]←0 when
a hyphen comes just before a[i], using TEX's hyphenation algorithm;
comment Phase 1. Search exception dictionary (an ordered hash table);
j← 7 min n;
hashsearch: t←mem[u+1];
for i←u+2 thru u+j do t←(t lsh 5)+mem[i];
h←t mod excepsize;
hashloop: while exceptable[h]>t do h←h-1;
if exceptable[h]≠t then
begin if h then
begin if j≠n or mem[u+n]≠o(s) then go to phase2;
j←j-1; go to hashsearch;
end;
h←excepsize-1; go to hashloop;
end;
comment Now the first 7 letters have been found in exceptable[h].
The corresponding hyphenation pattern appears in excephyph[h], but it
may be necessary to check more than seven letters to make sure the exception
applies. Additional letters to check appear at the righthand side of
excephyph[h], in a straightforward manner exhibited by the following code;
t←excephyph[h];
while t land '37 do
begin comment must check another letter;
j←j+1;
if mem[u+j]≠t land '37 then go to phase2;
t←t lsh -5;
end;
t←excephyph[h] land(flag ash(2-n)) # leftmost n-1 bits;
i←u+3;
while t do
begin if t<0 then mem[i]←0;
t←t lsh 1; i←i+1;
end;
go to phase5;
comment Phase 2. Interpretive routine for suffix removal.
The array suffix contains a "program" for a machine with the following
architecture. Instruction words have four fields, namely opcode, truex,
falsex, operand, each 9 bits. There are two registers: the program counter pc and
the character position i. There is also a toggle called firsttime.
Initially i=u+n-1, pc=mem[u+n], firsttime=true.
(Thus we begin by branching on the final character, mem[u+n].) The opcodes
are as follows, using t to stand for the operand field of the instruction:
scan. If mem[i]=t, decrease i by 1 and go to truex, else go to falsex.
double. Analogous, but tests if mem[i]=mem[i-1].
table. Analogous, but tests if mem[i]εsuffix[t], where xεy means that
word y shifted left x bits has a leading 1 bit.
check. Analogous, but tests if i>u+t and does not decrease i.
success. Sets mem[i+t+1]←0, stops.
fail. Stops.
repeat. Sets mem[i+t+1]←0, firsttime←false, i←i+t-1, pc←mem[i+1]. Thus,
the suffix routine is re-entered before the present suffix.*
again. If firsttime, sets firsttime←false, i←u+n-2, pc←mem[i+1]. Thus,
the suffix routine is re-entered with the final character omitted.*
Otherwise goes to truex.
mark. If t>0 or firsttime, sets mem[i+t+1]←0. Then goes to truex.
efail. (Special routine used to omit "ed".) If mem[u+n]="d" and
mem[u+n-1]="e", sets mem[u+n-1]←0, i←u+n-3, pc←mem[u+n-2]. Otherwise stops.
* Actually the suffix routine is reentered only when i≥u+3;
define opcodes=9,opcoded=27,truexs=9,truexd=18,falsexs=9,falsexd=9,oprands=9,
oprandd=0 # fields in interpreted instructions;
comment the above uses the fact that bitsperwd=36, much smaller fields would work;
define scan=0,double=1,table=2,check=3,success=4,fail=5,repeat=6,again=7,
mark=8,efail=9 # numeric equivalents of symbolic opcodes;
phase2: i←u+n-1; firsttime←true;
sufbegin: pc←mem[i+1]; if pc=o(e) then finale←i+1
else if finale=999999 then finale←i+2 else finale←1000000;
interps: case field(opcode,t←suffix[pc]) of begin
[scan] if(mem[i] xor t) land '37 then go to falsexx else i←i-1;
[double]if mem[i]≠mem[i-1] then go to falsexx else i←i-1;
[table] if(suffix[field(oprand,t)]lsh mem[i])≥0 then go to falsexx else i←i-1;
[check] if i≤u+field(oprand,t) then go to falsexx;
[success] begin mem[i+field(oprand,t)+1]←0; go to phase3 end;
[fail] go to phase3;
[repeat] begin i←i+field(oprand,t)-1; go to marksuf end;
[again] if firsttime then begin i←u+n-2; go to restarts end;
[mark] if (j←field(oprand,t)) or firsttime then mem[i+j+1]←0;
[efail] if mem[u+n]=o(d) and mem[u+n-1]=o(e) then
begin i←u+n-3; finale←999999; go to marksuf;
end
else go to phase3;
else confusion
end;
pc←field(truex,t); go to interps;
falsexx: pc←field(falsex,t); go to interps;
marksuf: mem[i+2]←0;
restarts: firsttime←false; if i≥u+3 then go to sufbegin;
comment Phase 3. Interpretive routine for prefix removal.
The array prefix contains a "program" for a machine with the following
architecture. Instruction words have four fields, namely opcode, truex,
falsex, operand, each 9 bits. There are two registers: the program counter pc and
the character position i. Initially i=u+2 and pc=mem[u+1].
(Thus we begin by branching on the first character, mem[u+1].) The opcodes
are as follows, using t to stand for the operand field of the instruction:
scan. If mem[i]=t, increase i by 1 and go to truex, else go to falsex.
repeat. Set i←i-t. If mem[i+1]=0, stop, otherwise set pc←mem[i],
mem[i]←0, i←i+1.
mark. If t>0 then set mem[i-t]←0. Also remember the value of mem[i],
for phase 4, then set mem[i]←0 (unless mem[i+1]=0) and stop.
table. If mem[i]ε(bit-pattern specified in truex,falsex,oprand fields)
then do a mark 0, otherwise just stop.
fail. Stop.
vow,cons. Stop.
Actually there are four flavors of stopping: One (vow) goes to phase 4 assuming
that mem[i-1] is a vowel, another (cons) goes to phase 4 with mem[i-1] ignored,
the third (fail) omits phase 4 entirely, the last (table when unsuccessful)
goes to phase 4 restarting at the beginning of the word;
define vow=success, cons=again # numeric versions of new opcodes;
phase3: pc←mem[u+1]; i←u+2;
restartp: c←pc; j←i-1;
interpp: case field(opcode,t←prefix[pc]) of begin
[scan]if(mem[i] xor t)land '37 then begin pc←field(falsex,t); go to interpp end
else begin i←i+1; pc←field(truex,t); go to interpp end;
[repeat] begin i←i-field(oprand,t)+1; if mem[i]=0 then go to phase5;
pc←mem[i-1]; mem[i-1]←0; go to restartp end;
[mark] begin if t←field(oprand,t) then mem[i-t]←0; go to marki end;
[table] if t lsh(mem[i]+opcodes)<0 then go to marki
else begin i←j; go to vowelscan end;
[fail] go to phase5;
[vow] go to phase4v;
[cons] go to phase4c;
else confusion
end;
comment Phase 4. This phase implements the consonant-pairs rule for middle
of words, as explained in the TEX writeup. Basically there are a few
special rules for double consonants and combining ch, gh, ph, sh, th into
single consonants, and then there are exceptional pairs of consonants between
which we will not break. There are two classes of exceptions, strong (like bl)
and weak (like ft). The necessary information is packed in btable, whose
words consist of three fields:
hchar specifies code for this character followed by letter h
weak specifies address of "weak" exception table for this character
leading 26 bits, give "strong"∨"weak" exception table
In order to keep hchar and weak to 3-bit fields, their values are encoded in
a straightforward manner that can be deduced by reading the following code;
define hchars=3,hchard=0,weaks=3,weakd=hchars # definition of btable fields;
marki: comment Now mark a permissible hyphen in mem[i] and do phase4 scanning;
if mem[i+1]=0 then go to phase5 # we don't allow only one letter between pref,suf;
if mem[i+1]=o(e) and mem[i+2]=o(d) and mem[i+3]=0 then go to phase5 # don't allow
syllable of form -<consonant>ed-;
c←mem[i]; mem[i]←0; go to vowelscan;
phase4c: c←mem[i];
vowelscan: comment We're looking for a vowel. Now c contains the letter
originally in mem[i], and suffix[0] is a table of vowels (including the null
code 0 as a vowel);
i←i+1; if(suffix[0] lsh c)≥0 then go to phase4c;
checkc: comment Now c is 0 if we've gone too far, else we've found a vowel;
if c=0 then go to phase5;
phase4v: b←mem[i]; i←i+1; if(suffix[0]lsh b)<0 then begin c←b;go to checkc;end;
comment Now b=mem[i-1] is a consonant following a vowel;
phase4vc: c←mem[i];
if b=o(q) and c=o(u) then begin i←i-1; go to marki end;
if(suffix[0] lsh c)<0 then begin i←i+1; go to checkc end;
if b=c then
begin comment double consonant;
if c≠o(l) and c≠o(s) then go to marki else
begin comment ll or ss, check for vowel;
if (c←mem[i+1])=0 then go to phase5;
if(suffix[0]lsh c)<0 then go to ertest;
i←i+2; go to phase4c;
end;
end
else if c=o(h) and j←field(hchar,btable[b]) then
begin comment change ch→e,gh→i,ph→o,sh→u,th→y;
b←b+j-2; i←i+1; go to phase4vc;
end
else if c=o(k) and b=o(c) then begin i←i+1; go to marki end;
if mem[i+1]=o(h) and j←field(hchar,btable[c]) then
begin comment change ch→e, etc., in second consonant position;
c←c+j-2; j←i+2;
end
else j←i+1 # Now j points to where we want a vowel;
if mem[j]=0 then go to phase5;
if(suffix[0] lsh mem[j])<0 then
begin comment vowel-consonant-consonant-vowel found;
if(btable[b] lsh (c-1))≥0 then go to marki # not an exception;
if(btable[field(weak,btable[b])+26] lsh(c-1))≥0 then
begin comment a strong exception;
i←j+1; go to phase4v;
end;
comment a weak exception; i←j-1;
if ((mem[i+1]=o(a) and mem[i+2]=o(g) and finale=i+3)
or (mem[i+1]=o(e) and mem[i+2]=o(s) and mem[i+3]=o(t)))
and mem[i+4]=0 then go to phase5 else go to ertest;
end;
comment three consonants in a row found;
i←j+1; go to phase4c;
ertest: if mem[i+1]=o(e) and mem[i+2]=o(r) and mem[i+3]=0
then go to phase5 else go to marki;
comment Phase 5. We're almost done! Although previous phases may have set mem[u+2]
or mem[u+n-1] or mem[u+n] to zero, we simply ignore this fact as we
output the answer;
phase5: q←link(p);
for i←u+3 thru u+n-2 do
begin if type(q)=kernnode then q←link(q);
comment if i=u+k, the variable q now points to a[k-1];
if mem[i] or (i+2≥finale and i≤finale) then q←link(q) else
begin getavail(r); t←link(q); mem[r]←dhyphen+t;
setlink(q,r); q←t;
end;
end;
freenode(u,n+2);
end;